import pandas as pd
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
# Load the weather observations CSV and display its (rows, columns) shape.
data = pd.read_csv('weatherAUS.csv')
data.shape
(145460, 23)
# Print each column's dtype and non-null count.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145460 entries, 0 to 145459 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 145460 non-null object 1 Location 145460 non-null object 2 MinTemp 143975 non-null float64 3 MaxTemp 144199 non-null float64 4 Rainfall 142199 non-null float64 5 Evaporation 82670 non-null float64 6 Sunshine 75625 non-null float64 7 WindGustDir 135134 non-null object 8 WindGustSpeed 135197 non-null float64 9 WindDir9am 134894 non-null object 10 WindDir3pm 141232 non-null object 11 WindSpeed9am 143693 non-null float64 12 WindSpeed3pm 142398 non-null float64 13 Humidity9am 142806 non-null float64 14 Humidity3pm 140953 non-null float64 15 Pressure9am 130395 non-null float64 16 Pressure3pm 130432 non-null float64 17 Cloud9am 89572 non-null float64 18 Cloud3pm 86102 non-null float64 19 Temp9am 143693 non-null float64 20 Temp3pm 141851 non-null float64 21 RainToday 142199 non-null object 22 RainTomorrow 142193 non-null object dtypes: float64(16), object(7) memory usage: 25.5+ MB
# Drop rows where RainToday or RainTomorrow is null (modifies `data` in place).
data.dropna(subset=['RainToday','RainTomorrow'],inplace=True)
# Global plot styling: seaborn dark grid plus matplotlib defaults for
# font size, figure size and a fully transparent figure background.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})
# Tabulate how many rows each location contributes, most frequent first.
print("Unique locations and their counts :")
location = (
    data["Location"]
    .value_counts()
    .rename_axis("Location")
    .reset_index(name="Counts")
)
print(location)
Unique locations and their counts :
Location Counts
0 Canberra 3402
1 Sydney 3331
2 Perth 3193
3 Darwin 3192
4 Hobart 3183
5 Brisbane 3132
6 Bendigo 3030
7 Townsville 3027
8 AliceSprings 3025
9 MountGambier 3022
10 Launceston 3020
11 Adelaide 3020
12 Ballarat 3017
13 PerthAirport 3009
14 MelbourneAirport 3009
15 Mildura 3005
16 SydneyAirport 3001
17 Albany 2996
18 Nuriootpa 2996
19 Sale 2992
20 Albury 2991
21 Watsonia 2990
22 Woomera 2984
23 Portland 2984
24 Cobar 2980
25 Cairns 2964
26 Tuggeranong 2959
27 WaggaWagga 2958
28 NorfolkIsland 2944
29 Wollongong 2943
30 SalmonGums 2941
31 CoffsHarbour 2940
32 Dartmoor 2939
33 Newcastle 2929
34 Witchcliffe 2924
35 GoldCoast 2924
36 Penrith 2911
37 Richmond 2906
38 NorahHead 2888
39 BadgerysCreek 2877
40 MountGinini 2816
41 Moree 2791
42 Walpole 2770
43 PearceRAAF 2646
44 Williamtown 2376
45 Melbourne 2298
46 Nhil 1565
47 Katherine 1545
48 Uluru 1502
# Treemap of locations; each tile is sized and coloured by its row count.
fig = px.treemap(
    location,
    path=['Location'],
    values='Counts',
    color='Counts',
    title="Arrangement Of Locations By Counts From Highest To Lowest",
)
fig.show()
# Rows per location, split by the RainToday value.
px.histogram(data, x="Location", color="RainToday", title="Location vs Rainy Days")
#data['RainTomorrow'].value_counts()
# Number of rows for each RainTomorrow value.
sns.countplot(x=data['RainTomorrow'])
<Axes: xlabel='RainTomorrow', ylabel='count'>
# Distribution of the 3 pm temperature for each RainTomorrow value.
sns.boxplot(data=data, x='RainTomorrow', y='Temp3pm')
plt.xlabel('Rain')
plt.ylabel('Temp')
plt.show()
# Counts of every (RainToday, RainTomorrow) combination, drawn as grouped bars.
Today_Tomo = pd.crosstab(index=data['RainToday'], columns=data['RainTomorrow'])
Today_Tomo.plot.bar(stacked=False)
plt.xlabel('Rain Today')
plt.ylabel('Count')
plt.title('Rain Today - Rain Tomorrow')
Text(0.5, 1.0, 'Rain Today - Rain Tomorrow')
# Strip plot of 3 pm temperature against 3 pm humidity on a random
# 2000-row sample, coloured by RainTomorrow.
px.strip(
    data.sample(2000),
    x='Temp3pm',
    y="Humidity3pm",
    color="RainTomorrow",
    title='Temp 3pm vs Humidity 3 pm',
)
# Histogram of the 3 pm temperature, split by RainTomorrow.
px.histogram(
    data,
    x='Temp3pm',
    color='RainTomorrow',
    title="Temperature at 3 pm vs. Rain Tomorrow",
)
# Histogram of the 3 pm humidity, split by RainTomorrow.
px.histogram(
    data,
    x='Humidity3pm',
    color='RainTomorrow',
    title="Humidity at 3 pm vs. Rain Tomorrow",
)
# Share of RainTomorrow outcomes for each Cloud3pm value, as 100% stacked bars.
Today_Tomo = pd.crosstab(data['Cloud3pm'], data['RainTomorrow'])
row_sums = Today_Tomo.sum(axis=1)
# Divide every row by its own total so each bar adds up to 100%.
Today_Tomo_percentage = Today_Tomo.div(row_sums, axis=0) * 100
Today_Tomo_percentage.plot(kind='bar', stacked=True)
# The x-axis holds the Cloud3pm categories (the crosstab index), not RainToday.
plt.xlabel('Cloud3pm')
plt.ylabel('Percentage')
plt.title('Cloud3pm - Rain Tomorrow (100% Stacked)')
# Show the graph
plt.show()
# Histogram of the Sunshine column, split by RainTomorrow.
px.histogram(
    data,
    x='Sunshine',
    color='RainTomorrow',
    title="Sunny Hours vs. Rain Tomorrow",
)
# Histogram of the 3 pm pressure, split by RainTomorrow.
px.histogram(
    data,
    x='Pressure3pm',
    color='RainTomorrow',
    title="Pressure at 3 pm vs. Rain Tomorrow",
)
#px.histogram(data,x='WindGustSpeed',title="Pressure at 3 pm vs. Rain Tomorrow",color='RainTomorrow')
# Percentage split of RainTomorrow for every distinct WindGustSpeed value.
Today_Tomo = pd.crosstab(index=data['WindGustSpeed'], columns=data['RainTomorrow'])
row_sums = Today_Tomo.sum(axis=1)
# Normalise row-wise so each bar totals 100.
Today_Tomo_percentage = Today_Tomo.div(row_sums, axis=0).mul(100)
Today_Tomo_percentage.plot.bar(stacked=True)
plt.xlabel('WindGustSpeed')
plt.ylabel('Percentage')
plt.title('WindGustSpeed - Rain Tomorrow (100% Stacked)')
# Render the figure.
plt.show()
# Distribution of WindGustSpeed for each RainTomorrow value.
sns.boxplot(x='RainTomorrow', y='WindGustSpeed', data=data)
plt.xlabel('Rain Tomorrow')
# The y-axis shows wind gust speed; the earlier 'Temp' label was a copy-paste slip.
plt.ylabel('Wind Gust Speed')
plt.show()
# Pairwise correlation between the numeric columns. numeric_only=True is
# passed explicitly because `data` also holds object columns (Date, Location,
# wind directions, RainToday/RainTomorrow): relying on the default emits a
# FutureWarning on pandas 1.5 and raises on pandas >= 2.0.
correlation = data.corr(numeric_only=True)
plt.figure(figsize=(16,12))
plt.title('Correlation Heatmap of Rain in Australia Dataset')
# Annotate every cell with its coefficient rounded to two decimals.
ax = sns.heatmap(correlation, square=True, annot=True, fmt='.2f', linecolor='white')
# Rotate tick labels so the long column names stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=30)
plt.show()
C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\4212842004.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
# Heatmap restricted to coefficients whose magnitude exceeds 0.6;
# every other cell is masked to NaN and therefore left blank.
plt.figure(figsize=(10, 10))
sns.heatmap(correlation.where(correlation.abs() > 0.6), annot=True, fmt='.2f')
<Axes: >
# Distribution of MaxTemp for each RainTomorrow value.
sns.boxplot(data=data, x='RainTomorrow', y='MaxTemp')
plt.xlabel('Rain')
plt.ylabel('Temp')
plt.show()
# 4x2 grid of density histograms with a KDE overlay: one row per
# measurement, 9 am reading on the left and 3 pm reading on the right.
fig, ax = plt.subplots(4, 2, figsize=(15, 25))
# (column-name prefix, title label, colour) for each grid row.
panels = [
    ('WindSpeed', 'Wind Speed', 'green'),
    ('Humidity', 'Humidity', 'orange'),
    ('Pressure', 'Pressure', 'red'),
    ('Temp', 'Temperature', 'blue'),
]
for row, (prefix, label, colour) in enumerate(panels):
    for col, hour in enumerate(('9am', '3pm')):
        # sns.distplot is deprecated (removal announced for seaborn 0.14);
        # histplot with kde=True and stat='density' is its axes-level
        # replacement and draws the same histogram-plus-KDE view.
        sns.histplot(
            data[prefix + hour],
            kde=True,
            stat='density',
            color=colour,
            ax=ax[row, col],
        )
        ax[row, col].set_title(f"{label} at {hour.upper()}", fontsize=15)
C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:8: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:12: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:16: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:20: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:24: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:28: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 C:\Users\Vibuzz\AppData\Local\Temp\ipykernel_16836\460462655.py:32: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
Text(0.5, 1.0, 'Temperature at 3PM')
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, then take 25% of the remainder
# as the validation set; both splits use the same fixed seed.
train_val_df, test_df = train_test_split(data, test_size=0.2, random_state=50)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=50)
# Report the shape of each partition.
for split_name, split_df in (('train', train_df), ('test', test_df), ('val', val_df)):
    print(f'{split_name} :', split_df.shape)
train : (84471, 23) test : (28158, 23) val : (28158, 23)
# Count of rows for each calendar year parsed from the Date column.
plt.title('No. of Rows Per Year')
sns.countplot(x=pd.to_datetime(data['Date']).dt.year)
<Axes: title={'center': 'No. of Rows Per Year'}, xlabel='Date', ylabel='count'>
# Count of rows for each calendar month parsed from the Date column.
# The title now says "Month" to match the plotted .dt.month values.
plt.title('No. of Rows Per Month')
sns.countplot(x=pd.to_datetime(data.Date).dt.month)
<Axes: title={'center': 'No. of Rows Per Year'}, xlabel='Date', ylabel='count'>